started: 2016
last updated: 19Jan2017
Source data includes
- the output generated by wes pipeline in nov2016
- table for samples info, including wes-gwas correspondence and pass/fail info about samples in wes
- several tables with phenotype data, obtained from DC at different times
- table with cases where pathogenic BRCA1, BRCA2 or PALB2 variants were detected
Importantly, exac and kgen data may not be correct (especially for the retained multiallelic sites) because GATK variant-to-table tool did not handle multiallelic sites correctly at the time of data generation.
# Time stamp
Sys.time()
[1] "2017-01-19 20:43:12 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
source_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/source_data"
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"
Done only once. Hence FALSE in copying.
src_folder <- "/scratch/medgen/users/alexey/wecare_3_nov2016/wecare_nfe_nov2016_vqsr_shf_sma_ann_txt"
tgt_folder <- source_data_folder
prefix="wecare_nfe_nov2016_vqsr_shf_sma_ann"
gt_file <- paste(prefix,"GT_add.txt",sep="_")
gq_file <- paste(prefix,"GQ.txt",sep="_")
dp_file <- paste(prefix,"DP.txt",sep="_")
vv_file <- paste(prefix,"VV.txt",sep="_")
exac_file <- paste(prefix,"exac.txt",sep="_")
kgen_file <- paste(prefix,"kgen.txt",sep="_")
file.copy(
paste(src_folder, gt_file, sep="/"),
paste(tgt_folder, gt_file, sep="/"))
[1] FALSE
file.copy(
paste(src_folder, gq_file, sep="/"),
paste(tgt_folder, gq_file, sep="/"))
[1] FALSE
file.copy(
paste(src_folder, dp_file, sep="/"),
paste(tgt_folder, dp_file, sep="/"))
[1] FALSE
file.copy(
paste(src_folder, vv_file, sep="/"),
paste(tgt_folder, vv_file, sep="/"))
[1] FALSE
file.copy(
paste(src_folder, exac_file, sep="/"),
paste(tgt_folder, exac_file, sep="/"))
[1] FALSE
file.copy(
paste(src_folder, kgen_file, sep="/"),
paste(tgt_folder, kgen_file, sep="/"))
[1] FALSE
# Clean-up
rm(src_folder, tgt_folder)
Done only once. Hence FALSE in copying.
Files with phenotypes updates (Dec2016) and cases with BRCA1, BRCA2 and PALB2 (Dec 2016) were added manually.
src_folder <- "/scratch/medgen/users/alexey/wecare_stat_2_aug2016/source_data"
tgt_folder <- source_data_folder
covar_file <- "covar.txt"
samples_file <- "samples_ids.txt"
demographics_file <- "WECARE.Exome.DemographicVariables.txt"
file.copy(
paste(src_folder, covar_file, sep="/"),
paste(tgt_folder, covar_file, sep="/"))
[1] FALSE
file.copy(
paste(src_folder, samples_file, sep="/"),
paste(tgt_folder, samples_file, sep="/"))
[1] FALSE
file.copy(
paste(src_folder, demographics_file, sep="/"),
paste(tgt_folder, demographics_file, sep="/"))
[1] FALSE
# Clean-up
rm(src_folder, tgt_folder)
gt.df <- read.table(
paste(source_data_folder, gt_file, sep="/"),
header=TRUE, row.names=1, sep="\t", quote="")
gq.df <- read.table(
paste(source_data_folder, gq_file, sep="/"),
header=TRUE, row.names=1, sep="\t", quote="")
dp.df <- read.table(
paste(source_data_folder, dp_file, sep="/"),
header=TRUE, row.names=1, sep="\t", quote="")
covar.df <- read.table(
paste(source_data_folder, covar_file, sep="/"),
sep="\t", header=TRUE, quote="")
samples.df <- read.table(
paste(source_data_folder, samples_file, sep="/"),
sep="\t", header=TRUE, quote="")
demographics.df <- read.table(
paste(source_data_folder, demographics_file, sep="/"),
sep="\t", header=TRUE, quote="")
vv.df <- read.table(
paste(source_data_folder, vv_file, sep="/"),
header=TRUE, sep="\t", quote="")
kgen.df <- read.table(
paste(source_data_folder, kgen_file, sep="/"),
header=TRUE, sep="\t", quote="")
exac.df <- read.table(
paste(source_data_folder, exac_file, sep="/"),
header=TRUE, sep="\t", quote="")
phenotypes_update_file <- "phenotypes_update_06Dec2016.txt"
phenotypes_update.df <- read.table(
paste(source_data_folder, phenotypes_update_file, sep="/"),
header=TRUE, sep="\t", quote="")
BRCA1_BRCA2_PALB2_cases_file <- "cases_with_BRCA1_BRCA2_PALB2_pathogenic_variants.txt"
BRCA1_BRCA2_PALB2_cases.df <- read.table(
paste(source_data_folder, BRCA1_BRCA2_PALB2_cases_file, sep="/"),
header=TRUE, sep="\t", quote="")
# Update rownames, when necessary
rownames(vv.df) <- vv.df[,1]
rownames(kgen.df) <- kgen.df[,1]
rownames(exac.df) <- exac.df[,1]
rownames(BRCA1_BRCA2_PALB2_cases.df) <- BRCA1_BRCA2_PALB2_cases.df[,1]
# Update colnames, when necessary
colnames(gt.df) <- sub(".GT", "", colnames(gt.df))
colnames(gq.df) <- sub(".GQ", "", colnames(gq.df))
colnames(dp.df) <- sub(".DP", "", colnames(dp.df))
# Clean-up
rm(source_data_folder, prefix, covar_file, demographics_file, samples_file, vv_file, gt_file, gq_file, dp_file, kgen_file, exac_file, phenotypes_update_file, BRCA1_BRCA2_PALB2_cases_file)
dim(gt.df)
[1] 343824 710
str(gt.df, list.len=5)
'data.frame': 343824 obs. of 710 variables:
$ HG00097 : int NA 0 0 0 0 0 0 0 0 0 ...
$ HG00099 : int 0 0 0 0 NA NA NA NA 0 0 ...
$ HG00100 : int 0 NA 0 0 0 0 0 0 0 0 ...
$ HG00102 : int NA NA 0 0 0 0 0 0 0 0 ...
$ HG00106 : int NA 0 0 0 0 0 0 0 0 0 ...
[list output truncated]
gt.df[1:5,1:5]
dim(gq.df)
[1] 343824 710
str(gq.df, list.len=5)
'data.frame': 343824 obs. of 710 variables:
$ HG00097 : int NA 99 36 36 36 36 36 36 36 99 ...
$ HG00099 : int 9 99 99 99 NA NA NA NA 99 48 ...
$ HG00100 : int 3 NA 21 18 9 9 9 75 57 18 ...
$ HG00102 : int NA NA 12 3 99 93 93 72 72 72 ...
$ HG00106 : int NA 58 21 21 33 33 33 33 33 33 ...
[list output truncated]
gq.df[1:5,1:5]
dim(dp.df)
[1] 343824 710
str(dp.df, list.len=5)
'data.frame': 343824 obs. of 710 variables:
$ HG00097 : int 0 74 13 13 28 28 28 28 28 37 ...
$ HG00099 : int 4 169 35 35 NA NA NA NA 40 18 ...
$ HG00100 : int 1 0 7 7 10 10 10 33 22 8 ...
$ HG00102 : int 0 0 4 1 37 32 32 27 27 24 ...
$ HG00106 : int NA 130 8 8 20 20 20 20 20 20 ...
[list output truncated]
dp.df[1:5,1:5]
dim(covar.df)
[1] 498 34
str(covar.df)
'data.frame': 498 obs. of 34 variables:
$ Subject_ID : int 200054 200491 200565 200698 200958 201046 201558 201921 202026 202236 ...
$ setno : int 382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
$ cc : int 0 0 0 0 0 0 1 1 0 0 ...
$ chemo : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone : int 0 1 1 0 1 0 0 1 1 0 ...
$ chemo_hormone : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
$ chemo_self_mra : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone_self_mra: int 0 1 1 0 1 0 0 1 1 0 ...
$ treatment : int 1 1 1 0 1 1 1 1 1 1 ...
$ ID : int 2 6 7 9 11 12 16 22 24 26 ...
$ labid : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
$ status : int 0 0 0 0 0 0 1 1 0 0 ...
$ status2 : int 0 0 0 0 0 0 1 1 0 0 ...
$ offset : num 6.41 5.82 5.61 4.03 4.77 ...
$ sub_dx_age : int 46 50 46 44 43 39 45 40 43 36 ...
$ XRTBreast : int 0 0 0 1 0 1 0 0 1 0 ...
$ Eigen_1 : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
$ Eigen_2 : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
$ Eigen_3 : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
$ Eigen_4 : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
$ Eigen_5 : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
$ dose : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
$ dsmiss : int 0 0 0 0 1 0 0 0 0 0 ...
$ good_location : int 1 1 0 1 1 1 0 1 0 0 ...
$ Deleterious : int 0 0 0 0 0 0 0 0 0 0 ...
$ registry : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
$ race : int 0 0 0 0 0 0 0 0 0 0 ...
$ age_stratum1 : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
$ dxyr_stratum : int 2 2 3 1 1 2 1 2 3 2 ...
$ CMF_Only : int 1 0 0 0 0 1 0 0 1 1 ...
$ family_history : int 0 0 0 0 0 0 1 0 0 0 ...
$ sub_dx_age_cat : int 0 0 0 1 1 1 0 1 1 1 ...
$ CMF : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
$ XRTBrCHAR : int 0 0 0 1 0 1 0 0 1 0 ...
covar.df[1:5,1:5]
dim(samples.df)
[1] 512 4
str(samples.df)
'data.frame': 512 obs. of 4 variables:
$ wes_id : Factor w/ 512 levels "P1_A01","P1_A02",..: 1 2 3 4 5 6 7 8 9 10 ...
$ gwas_id : Factor w/ 510 levels "id200054","id200491",..: 14 405 315 264 67 121 326 251 281 141 ...
$ merged_id: Factor w/ 512 levels "P1_A01_id203568",..: 1 2 3 4 5 6 7 8 9 10 ...
$ filter : Factor w/ 5 levels "duplicate","low_concordance",..: 5 5 5 5 5 5 5 5 5 5 ...
samples.df[1:5,]
dim(demographics.df)
[1] 505 91
str(demographics.df)
'data.frame': 505 obs. of 91 variables:
$ Subject_ID : Factor w/ 503 levels "200054","200491",..: 1 2 3 4 5 6 7 8 9 10 ...
$ ID.x : num 2 6 7 9 11 12 NA NA 24 26 ...
$ labid.x : Factor w/ 498 levels "15582015","19212019",..: 6 7 8 9 10 11 NA NA 12 13 ...
$ Eigen_1.x : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
$ Eigen_2.x : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
$ Eigen_3.x : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
$ Eigen_4.x : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
$ Eigen_5.x : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
$ Phase : num 1 1 1 1 1 1 NA NA 1 1 ...
$ setno.x : num 382125 204356 360683 226881 357431 ...
$ cc.x : int 0 0 0 0 0 0 NA NA 0 0 ...
$ rstime : num 7.42 9.75 6.09 6.25 9.34 7 NA NA 6.09 8.66 ...
$ registry.x : Factor w/ 7 levels "","IA","IR","LA",..: 6 3 3 5 5 4 NA NA 2 6 ...
$ race.x : num 0 0 0 0 0 0 NA NA 0 0 ...
$ offset.x : num 6.41 5.82 5.61 4.03 4.77 ...
$ DOB : num -5049 -7459 -3916 -5890 -6407 ...
$ sub_dx_age.x : num 46 50 46 44 43 39 NA NA 43 36 ...
$ refage : num 53 59 51 50 52 45 NA NA 48 44 ...
$ BMI_age18 : num 20.2 19.7 23.3 19.5 25.8 ...
$ BMI_dx : num 22.8 20.9 23.3 32.6 25.8 ...
$ BMI_ref : num 25.7 22 25.8 32.6 28.1 ...
$ hormone_self_mra.x : num 0 1 1 0 1 0 NA NA 1 0 ...
$ chemo_self_mra.x : num 1 1 0 0 1 1 NA NA 1 1 ...
$ treatment.x : num 1 1 1 0 1 1 NA NA 1 1 ...
$ family_history.x : Factor w/ 3 levels "1+","none","othe": 2 2 2 2 2 2 NA NA 2 2 ...
$ rh_age_menarche : num 13 14 13 13 12 9 NA NA 12 11 ...
$ age_menopause_1yrbf_fd: num -1 48 -1 -1 -1 -1 NA NA -1 -1 ...
$ age_1fftp_fd : num 20 22 0 29 28 0 NA NA 27 24 ...
$ Num_ftp_fd : num 1 2 -1 2 2 -1 NA NA 3 2 ...
$ Histology : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
$ Histology1 : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
$ Hist_lob_other : Factor w/ 4 levels "lobular","other",..: 2 2 2 2 2 2 NA NA 2 2 ...
$ stage_fd : num 2 1 1 1 2 2 NA NA 2 2 ...
$ er_fd : num 1 1 1 4 1 2 NA NA 2 2 ...
$ pr_fd : num 1 1 1 2 1 2 NA NA 1 0 ...
$ histo1_cat : Factor w/ 9 levels "Tubular/mucinous",..: 9 4 4 6 4 4 NA NA 4 4 ...
$ er1_cat : Factor w/ 5 levels "negative","own unkn",..: 3 3 3 5 3 1 NA NA 1 1 ...
$ pr1_cat : Factor w/ 7 levels "negative","own 0 Ot",..: 3 3 3 1 3 1 NA NA 3 7 ...
$ status.x : num 0 0 0 0 0 0 NA NA 0 0 ...
$ status2.x : Factor w/ 4 levels "","0","1","h": 2 2 2 2 2 2 NA NA 2 2 ...
$ sub_race : num 0 0 0 0 0 0 NA NA 0 0 ...
$ er1_num : num 1 1 1 NA 1 0 NA NA 0 0 ...
$ er1 : int 1 1 1 NA 1 0 NA NA 0 0 ...
$ horm_tmx : num 0 1 1 0 2 0 NA NA 1 0 ...
$ XRTBreast.x : num 0 0 0 1 0 1 NA NA 1 0 ...
$ dose_caseloc : num 0 0 0 0.96 NA 0.88 NA NA 0.76 0 ...
$ good_location.x : num 1 1 0 1 1 1 NA NA 0 0 ...
$ avedose : num 0 0 0 1.65 NA 1.45 NA NA 0.91 0 ...
$ tmx : num 0 1 1 0 0 0 NA NA 1 0 ...
$ num_preg : num 1 1 0 1 1 0 NA NA 1 1 ...
$ fam_hist : num 0 0 0 0 0 0 NA NA 0 0 ...
$ age_menarche : int 1 1 1 1 0 0 NA NA 0 0 ...
$ lobular : num 0 0 0 0 0 0 NA NA 0 0 ...
$ age_menopause : int 0 2 0 0 0 0 NA NA 0 0 ...
$ conf_miss : num 0 0 0 0 0 0 NA NA 0 0 ...
$ dose_num : num 0 0 0 1 NA 1 NA NA 1 0 ...
$ cmf : Factor w/ 6 levels "","\024\x9c@",..: 3 4 5 5 3 3 NA NA 3 3 ...
$ cmf_012 : num 1 2 0 0 1 1 NA NA 1 1 ...
$ setno.y : int 382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
$ cc.y : int 0 0 0 0 0 0 1 1 0 0 ...
$ chemo : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone : int 0 1 1 0 1 0 0 1 1 0 ...
$ chemo_hormone : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
$ chemo_self_mra.y : int 1 1 0 0 1 1 1 1 1 1 ...
$ hormone_self_mra.y : int 0 1 1 0 1 0 0 1 1 0 ...
$ treatment.y : int 1 1 1 0 1 1 1 1 1 1 ...
$ ID.y : int 2 6 7 9 11 12 16 22 24 26 ...
$ labid.y : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
$ status.y : int 0 0 0 0 0 0 1 1 0 0 ...
$ status2.y : int 0 0 0 0 0 0 1 1 0 0 ...
$ offset.y : num 6.41 5.82 5.61 4.03 4.77 ...
$ sub_dx_age.y : int 46 50 46 44 43 39 45 40 43 36 ...
$ XRTBreast.y : int 0 0 0 1 0 1 0 0 1 0 ...
$ Eigen_1.y : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
$ Eigen_2.y : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
$ Eigen_3.y : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
$ Eigen_4.y : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
$ Eigen_5.y : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
$ dose : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
$ dsmiss : int 0 0 0 0 1 0 0 0 0 0 ...
$ good_location.y : int 1 1 0 1 1 1 0 1 0 0 ...
$ Deleterious : int 0 0 0 0 0 0 0 0 0 0 ...
$ registry.y : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
$ race.y : int 0 0 0 0 0 0 0 0 0 0 ...
$ age_stratum1 : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
$ dxyr_stratum : int 2 2 3 1 1 2 1 2 3 2 ...
$ CMF_Only : int 1 0 0 0 0 1 0 0 1 1 ...
$ family_history.y : int 0 0 0 0 0 0 1 0 0 0 ...
$ sub_dx_age_cat : int 0 0 0 1 1 1 0 1 1 1 ...
$ CMF : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
$ XRTBrCHAR : int 0 0 0 1 0 1 0 0 1 0 ...
demographics.df[1:5,1:5]
dim(phenotypes_update.df)
[1] 13 22
str(phenotypes_update.df)
'data.frame': 13 obs. of 22 variables:
$ gwas_id : Factor w/ 13 levels "id201558","id201921",..: 11 8 7 9 3 6 10 5 4 13 ...
$ wes_id : Factor w/ 13 levels "P1_C02","P1_C04",..: 1 2 3 4 5 6 7 8 9 10 ...
$ cc : int 0 0 0 0 1 0 1 1 0 0 ...
$ setno : int 227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
$ family_history: int 1 1 1 0 1 0 1 1 0 0 ...
$ age_dx : int 30 30 31 30 24 23 31 33 49 40 ...
$ age_ref : int 39 33 32 34 28 27 32 37 54 47 ...
$ rstime : num 9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
$ Eigen_1 : num -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
$ Eigen_2 : num 0.00615 0.00853 0.00138 0.00428 0.00317 ...
$ Eigen_3 : num -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
$ Eigen_4 : num -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
$ Eigen_5 : num -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
$ stage : int 1 1 2 2 2 1 2 1 1 1 ...
$ er1 : Factor w/ 3 levels "0","1","missing": 1 2 1 2 1 2 2 3 2 3 ...
$ pr1 : Factor w/ 3 levels "0","1","missing": 1 2 3 2 1 2 1 3 2 3 ...
$ hist_cat : Factor w/ 3 levels "ductal ",..: 3 1 1 1 1 3 2 1 1 1 ...
$ hormone : Factor w/ 3 levels "0","1","missing": 1 1 1 1 1 1 1 1 3 1 ...
$ chemo_cat : Factor w/ 3 levels "CMF","Oth","no ": 1 3 2 3 2 2 1 2 2 2 ...
$ br_xray : int 1 0 0 0 0 1 0 1 0 1 ...
$ br_xray_dose : num 1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
$ num_preg : int 2 0 1 1 1 0 0 1 1 0 ...
phenotypes_update.df[1:5,1:5]
dim(BRCA1_BRCA2_PALB2_cases.df)
[1] 11 12
str(BRCA1_BRCA2_PALB2_cases.df)
'data.frame': 11 obs. of 12 variables:
$ Cases_wes_id : Factor w/ 11 levels "P1_A09","P1_C02",..: 1 2 3 4 5 6 7 8 9 10 ...
$ Cases_gwas_id: Factor w/ 11 levels "id204830","id247333",..: 8 9 3 1 6 2 4 11 5 7 ...
$ SYMBOL : Factor w/ 3 levels "BRCA1","BRCA2",..: 1 3 1 1 2 1 2 2 3 3 ...
$ TYPE : Factor w/ 2 levels "INDEL","SNP": 2 1 2 2 2 1 1 1 2 2 ...
$ CHROM : int 17 16 17 17 13 17 13 13 16 16 ...
$ POS.GRCh37 : int 41256985 23641422 41243800 41258504 32936732 41276044 32914437 32914437 23625324 23634452 ...
$ REF : Factor w/ 6 levels "A","C","G","GT",..: 5 6 2 1 3 1 4 4 2 2 ...
$ ALT : Factor w/ 5 levels "A","ACT","C",..: 3 5 1 3 3 2 4 4 4 4 ...
$ rs_id : Factor w/ 9 levels "rs28897672","rs28897686",..: 7 5 2 1 8 6 9 9 4 3 ...
$ Consequence : Factor w/ 6 levels "frameshift_variant",..: 2 1 6 3 3 1 1 1 5 4 ...
$ wes_var_id : Factor w/ 9 levels "Var000223294",..: 7 5 6 8 2 9 1 1 3 4 ...
$ Note : Factor w/ 2 levels "","Added for QC": 1 1 2 2 1 2 1 1 1 1 ...
BRCA1_BRCA2_PALB2_cases.df[1:5,1:5]
dim(vv.df)
[1] 343824 35
str(vv.df)
'data.frame': 343824 obs. of 35 variables:
$ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
$ TYPE : Factor w/ 2 levels "INDEL","SNP": 1 2 2 2 2 2 2 2 2 2 ...
$ ID : Factor w/ 253381 levels "rs10000804","rs10000924",..: NA 234676 201998 253336 54527 NA NA 162750 NA NA ...
$ CHROM : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
$ POS : int 664486 762330 865628 865694 871159 871171 871173 871215 871230 871271 ...
$ REF : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1653 1044 1044 539 1044 1 539 539 539 539 ...
$ ALT : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 989 989 1 989 1 664 1 664 989 989 ...
$ QUAL : num 35191 7081 6420 991 649 ...
$ DP : int 49355 26720 14628 8357 11648 12903 13516 16282 15415 10748 ...
$ AS_VQSLOD : num 1.57 7.04 16.2 18.18 16.05 ...
$ FILTER : Factor w/ 1 level "PASS": 1 1 1 1 1 1 1 1 1 1 ...
$ AC : int 53 8 7 4 1 1 1 5 1 1 ...
$ AF : num 0.044 0.00622 0.00519 0.00318 0.00082 ...
$ AN : int 1206 1286 1348 1258 1220 1250 1258 1312 1310 1294 ...
$ NEGATIVE_TRAIN_SITE: logi NA NA NA NA NA NA ...
$ POSITIVE_TRAIN_SITE: logi NA NA NA TRUE NA NA ...
$ SYMBOL : Factor w/ 19874 levels "A1BG","A1CF",..: 14360 9127 14922 14922 14922 14922 14922 14922 14922 14922 ...
$ Allele : Factor w/ 1048 levels "-","A","AA","AAA",..: 1 790 2 790 2 533 2 533 790 790 ...
$ Existing_variation : Factor w/ 307403 levels "","1KG_2_227731951",..: 1 255726 203059 307355 55938 218187 1 163633 277286 1 ...
$ Consequence : Factor w/ 80 levels "3_prime_UTR_variant",..: 80 32 28 28 28 28 78 78 78 28 ...
$ IMPACT : Factor w/ 4 levels "HIGH","LOW","MODERATE",..: 4 4 3 3 3 3 2 2 2 3 ...
$ CLIN_SIG : Factor w/ 93 levels "association",..: NA NA NA NA NA NA NA NA NA NA ...
$ cDNA_position : Factor w/ 16588 levels "0-1","1","1-18",..: NA 11562 6108 7438 8899 9105 9146 9809 10047 10653 ...
$ CDS_position : Factor w/ 15333 levels "1","1-18","1-2",..: NA NA 3598 5147 6675 6879 6912 7603 7853 8506 ...
$ Codons : Factor w/ 3014 levels "-/A","-/AA","-/AAAA",..: NA NA 698 472 690 319 1167 1753 1848 1532 ...
$ Protein_position : Factor w/ 7359 levels "1","1-2","1-6",..: NA NA 5937 6741 125 216 216 514 620 910 ...
$ Amino_acids : Factor w/ 1851 levels "*","*/*EX","*/C",..: NA NA 589 695 589 1586 1583 1100 1329 1107 ...
$ DISTANCE : int 960 NA NA NA NA NA NA NA NA NA ...
$ STRAND : int -1 -1 1 1 1 1 1 1 1 1 ...
$ SYMBOL_SOURCE : Factor w/ 6 levels "Clone_based_ensembl_gene",..: 2 3 3 3 3 3 3 3 3 3 ...
$ SIFT_call : Factor w/ 4 levels "deleterious",..: NA NA 2 1 3 1 NA NA NA 4 ...
$ SIFT_score : num NA NA 0.01 0.04 1 0.05 NA NA NA 0.09 ...
$ PolyPhen_call : Factor w/ 4 levels "benign","possibly_damaging",..: NA NA 1 2 1 1 NA NA NA 1 ...
$ PolyPhen_score : num NA NA 0.099 0.493 0.002 0.018 NA NA NA 0.045 ...
$ Multiallelic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
vv.df[1:5,1:5]
dim(kgen.df)
[1] 343824 9
str(kgen.df)
'data.frame': 343824 obs. of 9 variables:
$ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
$ kgen.AC : int NA NA 14 263 2 1 NA NA NA NA ...
$ kgen.AN : int NA NA 5008 5008 5008 5008 NA NA NA NA ...
$ kgen.AF : num NA NA 0.002796 0.052516 0.000399 ...
$ kgen.AFR_AF: num NA NA 0 0.0227 0 0 NA NA NA NA ...
$ kgen.AMR_AF: num NA NA 0.0072 0.0965 0 0 NA NA NA NA ...
$ kgen.EAS_AF: num NA NA 0 0.139 0 ...
$ kgen.EUR_AF: num NA NA 0.005 0.002 0.002 0.001 NA NA NA NA ...
$ kgen.SAS_AF: num NA NA 0.0041 0.0245 0 0 NA NA NA NA ...
kgen.df[1:5,1:5]
dim(exac.df)
[1] 343824 48
str(exac.df)
'data.frame': 343824 obs. of 48 variables:
$ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
$ exac_non_TCGA.AF : num NA 0.012 0.00278 0.025 0.00231 ...
$ exac_non_TCGA.AC : int NA 175 295 2670 245 NA NA NA NA NA ...
$ exac_non_TCGA.AN : int NA 14012 106020 106038 106210 NA NA NA NA NA ...
$ exac_non_TCGA.AC_FEMALE: int NA 59 94 1232 97 NA NA 1553 NA NA ...
$ exac_non_TCGA.AN_FEMALE: int NA 3452 18598 24670 45846 NA NA 45840 NA NA ...
$ exac_non_TCGA.AC_MALE : int NA 91 170 1159 148 NA NA 1483 NA NA ...
$ exac_non_TCGA.AN_MALE : int NA 8250 26380 33440 60312 NA NA 60304 NA NA ...
$ exac_non_TCGA.AC_Adj : int NA 150 264 2391 245 NA NA 3036 NA NA ...
$ exac_non_TCGA.AN_Adj : int NA 11702 44978 58110 106158 NA NA 106144 NA NA ...
$ exac_non_TCGA.AC_Hom : int NA 0 1 102 2 NA NA 136 NA NA ...
$ exac_non_TCGA.AC_Het : int NA 150 262 2187 241 NA NA 2760 NA NA ...
$ exac_non_TCGA.AC_Hemi : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_AFR : int NA 85 6 172 0 NA NA 263 NA NA ...
$ exac_non_TCGA.AN_AFR : int NA 422 4194 5062 9054 NA NA 9040 NA NA ...
$ exac_non_TCGA.Hom_AFR : int NA 0 0 2 0 NA NA 6 NA NA ...
$ exac_non_TCGA.Het_AFR : int NA 85 6 168 0 NA NA 251 NA NA ...
$ exac_non_TCGA.Hemi_AFR : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_AMR : int NA 2 20 979 0 NA NA 1180 NA NA ...
$ exac_non_TCGA.AN_AMR : int NA 224 3570 6030 11216 NA NA 11210 NA NA ...
$ exac_non_TCGA.Hom_AMR : int NA 0 0 46 0 NA NA 67 NA NA ...
$ exac_non_TCGA.Het_AMR : int NA 2 20 887 0 NA NA 1046 NA NA ...
$ exac_non_TCGA.Hemi_AMR : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_EAS : int NA 0 0 792 0 NA NA 889 NA NA ...
$ exac_non_TCGA.AN_EAS : int NA 314 3432 4930 7866 NA NA 7854 NA NA ...
$ exac_non_TCGA.Hom_EAS : int NA 0 0 47 0 NA NA 52 NA NA ...
$ exac_non_TCGA.Het_EAS : int NA 0 0 698 0 NA NA 785 NA NA ...
$ exac_non_TCGA.Hemi_EAS : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_FIN : int NA 0 9 4 149 NA NA 39 NA NA ...
$ exac_non_TCGA.AN_FIN : int NA 30 1376 1960 6614 NA NA 6614 NA NA ...
$ exac_non_TCGA.Hom_FIN : int NA 0 0 0 2 NA NA 0 NA NA ...
$ exac_non_TCGA.Het_FIN : int NA 0 9 4 145 NA NA 39 NA NA ...
$ exac_non_TCGA.Hemi_FIN : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_NFE : int NA 50 193 61 92 NA NA 172 NA NA ...
$ exac_non_TCGA.AN_NFE : int NA 2944 22156 28886 54312 NA NA 54328 NA NA ...
$ exac_non_TCGA.Hom_NFE : int NA 0 1 1 0 NA NA 2 NA NA ...
$ exac_non_TCGA.Het_NFE : int NA 50 191 59 92 NA NA 168 NA NA ...
$ exac_non_TCGA.Hemi_NFE : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_SAS : int NA 10 33 373 1 NA NA 478 NA NA ...
$ exac_non_TCGA.AN_SAS : int NA 7650 9972 10884 16402 NA NA 16404 NA NA ...
$ exac_non_TCGA.Hom_SAS : int NA 0 0 6 0 NA NA 9 NA NA ...
$ exac_non_TCGA.Het_SAS : int NA 10 33 361 1 NA NA 456 NA NA ...
$ exac_non_TCGA.Hemi_SAS : int NA NA NA NA NA NA NA NA NA NA ...
$ exac_non_TCGA.AC_OTH : int NA 3 3 10 3 NA NA 15 NA NA ...
$ exac_non_TCGA.AN_OTH : int NA 118 278 358 694 NA NA 694 NA NA ...
$ exac_non_TCGA.Hom_OTH : int NA 0 0 0 0 NA NA 0 NA NA ...
$ exac_non_TCGA.Het_OTH : int NA 3 3 10 3 NA NA 15 NA NA ...
$ exac_non_TCGA.Hemi_OTH : int NA NA NA NA NA NA NA NA NA NA ...
exac.df[1:5,1:5]
gt.mx <- as.matrix(gt.df)
gq.mx <- as.matrix(gq.df)
dp.mx <- as.matrix(dp.df)
dim(gt.mx)
[1] 343824 710
class(gt.mx)
[1] "matrix"
gt.mx[1:5,1:5]
HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001 NA 0 0 NA NA
Var000000002 0 0 NA NA 0
Var000000003 0 0 0 0 0
Var000000004 0 0 0 0 0
Var000000005 0 NA 0 0 0
dim(gq.mx)
[1] 343824 710
class(gq.mx)
[1] "matrix"
gq.mx[1:5,1:5]
HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001 NA 9 3 NA NA
Var000000002 99 99 NA NA 58
Var000000003 36 99 21 12 21
Var000000004 36 99 18 3 21
Var000000005 36 NA 9 99 33
dim(dp.mx)
[1] 343824 710
class(dp.mx)
[1] "matrix"
dp.mx[1:5,1:5]
HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001 0 4 1 0 NA
Var000000002 74 169 0 0 130
Var000000003 13 35 7 4 8
Var000000004 13 35 7 1 8
Var000000005 28 NA 10 37 20
rm(gt.df, gq.df, dp.df)
# rownames
sum(rownames(gt.mx) != rownames(gq.mx))
[1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
[1] 0
sum(rownames(gt.mx) != rownames(vv.df))
[1] 0
sum(rownames(gt.mx) != rownames(kgen.df))
[1] 0
sum(rownames(gt.mx) != rownames(exac.df))
[1] 0
# colnames
sum(colnames(gt.mx) != colnames(gq.mx))
[1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
[1] 0
NA -> vv.df[vv.df$Existing_variation == "", "Existing_variation"] # no blanks in other fields
NA -> covar.df[covar.df == ""] # 1 case in chemo_hormone (row 84)
NA -> demographics.df[demographics.df == ""] # 2 cases in registry and one in chemo_hormone
NA -> BRCA1_BRCA2_PALB2_cases.df[BRCA1_BRCA2_PALB2_cases.df == ""] # 8 cases in notes
# No blanks in other tables
# Converst factors to vectors (to symplify comparisons)
str(phenotypes_update.df)
'data.frame': 13 obs. of 22 variables:
$ gwas_id : Factor w/ 13 levels "id201558","id201921",..: 11 8 7 9 3 6 10 5 4 13 ...
$ wes_id : Factor w/ 13 levels "P1_C02","P1_C04",..: 1 2 3 4 5 6 7 8 9 10 ...
$ cc : int 0 0 0 0 1 0 1 1 0 0 ...
$ setno : int 227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
$ family_history: int 1 1 1 0 1 0 1 1 0 0 ...
$ age_dx : int 30 30 31 30 24 23 31 33 49 40 ...
$ age_ref : int 39 33 32 34 28 27 32 37 54 47 ...
$ rstime : num 9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
$ Eigen_1 : num -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
$ Eigen_2 : num 0.00615 0.00853 0.00138 0.00428 0.00317 ...
$ Eigen_3 : num -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
$ Eigen_4 : num -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
$ Eigen_5 : num -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
$ stage : int 1 1 2 2 2 1 2 1 1 1 ...
$ er1 : Factor w/ 3 levels "0","1","missing": 1 2 1 2 1 2 2 3 2 3 ...
$ pr1 : Factor w/ 3 levels "0","1","missing": 1 2 3 2 1 2 1 3 2 3 ...
$ hist_cat : Factor w/ 3 levels "ductal ",..: 3 1 1 1 1 3 2 1 1 1 ...
$ hormone : Factor w/ 3 levels "0","1","missing": 1 1 1 1 1 1 1 1 3 1 ...
$ chemo_cat : Factor w/ 3 levels "CMF","Oth","no ": 1 3 2 3 2 2 1 2 2 2 ...
$ br_xray : int 1 0 0 0 0 1 0 1 0 1 ...
$ br_xray_dose : num 1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
$ num_preg : int 2 0 1 1 1 0 0 1 1 0 ...
as.character(phenotypes_update.df$gwas_id) -> phenotypes_update.df$gwas_id
as.character(phenotypes_update.df$wes_id) -> phenotypes_update.df$wes_id
trimws(as.character(phenotypes_update.df$hist_cat)) -> phenotypes_update.df$hist_cat
trimws(as.character(phenotypes_update.df$chemo_cat)) -> phenotypes_update.df$chemo_cat
as.character(phenotypes_update.df$er1) -> phenotypes_update.df$er1
as.character(phenotypes_update.df$pr1) -> phenotypes_update.df$pr1
as.character(phenotypes_update.df$hormone) -> phenotypes_update.df$hormone
NA -> phenotypes_update.df[phenotypes_update.df$er1 == "missing", "er1"]
NA -> phenotypes_update.df[phenotypes_update.df$pr1 == "missing", "pr1"]
NA -> phenotypes_update.df[phenotypes_update.df$hormone == "missing", "hormone"]
as.numeric(phenotypes_update.df$er1) -> phenotypes_update.df$er1
as.numeric(phenotypes_update.df$pr1) -> phenotypes_update.df$pr1
as.numeric(phenotypes_update.df$hormone) -> phenotypes_update.df$hormone
str(phenotypes_update.df)
'data.frame': 13 obs. of 22 variables:
$ gwas_id : chr "id319270" "id298378" "id271981" "id301570" ...
$ wes_id : chr "P1_C02" "P1_C04" "P1_C06" "P1_C11" ...
$ cc : int 0 0 0 0 1 0 1 1 0 0 ...
$ setno : int 227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
$ family_history: int 1 1 1 0 1 0 1 1 0 0 ...
$ age_dx : int 30 30 31 30 24 23 31 33 49 40 ...
$ age_ref : int 39 33 32 34 28 27 32 37 54 47 ...
$ rstime : num 9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
$ Eigen_1 : num -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
$ Eigen_2 : num 0.00615 0.00853 0.00138 0.00428 0.00317 ...
$ Eigen_3 : num -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
$ Eigen_4 : num -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
$ Eigen_5 : num -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
$ stage : int 1 1 2 2 2 1 2 1 1 1 ...
$ er1 : num 0 1 0 1 0 1 1 NA 1 NA ...
$ pr1 : num 0 1 NA 1 0 1 0 NA 1 NA ...
$ hist_cat : chr "other carcinoma" "ductal" "ductal" "ductal" ...
$ hormone : num 0 0 0 0 0 0 0 0 NA 0 ...
$ chemo_cat : chr "CMF" "no" "Oth" "no" ...
$ br_xray : int 1 0 0 0 0 1 0 1 0 1 ...
$ br_xray_dose : num 1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
$ num_preg : int 2 0 1 1 1 0 0 1 1 0 ...
save.image(paste(interim_data_folder, "r01_read_and_clean_data_jan2017.RData", sep="/"))
ls()
[1] "BRCA1_BRCA2_PALB2_cases.df" "covar.df" "demographics.df" "dp.mx"
[5] "exac.df" "gq.mx" "gt.mx" "interim_data_folder"
[9] "kgen.df" "phenotypes_update.df" "samples.df" "vv.df"
sessionInfo()
R version 3.2.3 (2015-12-10)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: Scientific Linux 7.2 (Nitrogen)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=C LC_COLLATE=C LC_MONETARY=C
[6] LC_MESSAGES=C LC_PAPER=C LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=C LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] backports_1.0.4 magrittr_1.5 rprojroot_1.1 htmltools_0.3.5 tools_3.2.3 base64enc_0.1-3 yaml_2.1.14
[8] Rcpp_0.12.8 stringi_1.1.2 rmarkdown_1.3 knitr_1.15.1 jsonlite_1.2 stringr_1.1.0 digest_0.6.11
[15] evaluate_0.10
Sys.time()
[1] "2017-01-19 20:51:50 GMT"